"""
配置文件
存储了各种文件路径以及爬虫的各种配置项
"""

from random import choice

## File path settings

# Input CSV listing the companies to crawl (see README, Quick Start, step 2).
# It has a single column whose header is 'name'; each row is a company name.
csv_path = "source.csv"

# SQLite database file that records the crawl status of every company.
sql_path = "source.db"

# Name of the SQLite table inside that database. Its schema is:
#   create table <table_name> (
#       name text,      -- company name
#       done integer    -- crawl flag: 0 = not crawled yet, 1 = crawled
#   )
table_name = "company"

# Where the raw crawl results are written (see README, Quick Start, step 4).
tmp_path = "tmp.csv"

# Log file listing the companies whose crawl failed, together with the reason
# (see README, Quick Start, step 4).
log_path = "log.csv"

# Where the processed results are written (see README, Quick Start, step 5).
out_path = "out.xlsx"


## Crawler settings

# Tianyancha cookies carrying the login session. Log in with a browser and copy
# them from the page; each entry is used verbatim as a "Cookie" header value.
t_cookies = []

# Qichacha cookies; these also have to come from a logged-in session.
q_cookies = []


# Request headers for Tianyancha.
# One cookie is picked at random when this module is imported. While the cookie
# list is still empty the header falls back to an empty string, so importing
# the configuration does not raise IndexError before cookies are filled in.
t_headers = {
    "Host": "www.tianyancha.com",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "https://www.tianyancha.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # No trailing comma here: it would add an empty element to the list.
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cookie": choice(t_cookies) if t_cookies else "",
}

# Request headers for Qichacha; the cookie is chosen the same way as above.
q_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cookie": choice(q_cookies) if q_cookies else "",
}

# Tianyancha search URL template; "{}" is the placeholder for the search key.
tyc_search = "https://www.tianyancha.com/search?key={}"

# Qichacha search URL template; "{}" is the placeholder for the search key.
qcc_search = "https://www.qcc.com/web/search?key={}"

# Column headers of the crawl result table, in output order.
heads = [
    "企业全称",
    "最新名称",
    "是否独角兽",
    "是否上市",
    "是否国高新",
    "参保人数",
    "A",
    "A以上",
    "B",
    "融资轮次",
    "融资额",
    "专利数量",
    "软著数量",
    "近一年是否有招聘",
    "上年度主营业务收入",
]
